In [5]:
# libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
import openpyxl

# libraries for data manipulation and model building
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor  # used for imputing missing values
from sklearn.model_selection import KFold,train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures

# from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.preprocessing import OneHotEncoder


# import warnings
# import pandas_access as mdb
# warnings.filterwarnings('ignore')
# %matplotlib inline

This project features the 2023 national data of the YouthRisk Behavior Surveillance System (YRBSS) survey conducted by the Centers for Disease Control and Prevention (CDC). It is developed in 1990 to monitor health risk behaviors among high school students in the United States and has provided insights into the leading causes of death, disability, and social problems among adolescents. Alongside with the student demographics (sex, sexual identity, race and ethnicity, and grade), a total of 107 survey questions includes youth health behaviors and conditions, substance use behaviors, and student experiences.
The goal of this initial EDA is to understand the dataset and the patterns as well as the possible co-occurence of health risk behaviors.
Apart from two free text entry variables recording the respondent's height in meters (q6) and weight in kilograms (q7) that are also used to calculate the Bosy Mass Index (BMI) in the formula: BMI = Weight/Height^2, most of the survey questions require respondents to select from categorized choices defining the extent of the described situation in the question. There are 20103 entries collected for 2023. More on data edits and the detailed code book can be found in the User Guide for 2023 YRBS.
Changes to note from the 2021 national dataset is that substance use weighted percentage has not change significantly (by 0.03%) within the 2 years, mental health related problems have been alleviated by a small scale as we can see that the rate of respondents that have seriously considered attempting suicide decreased from 22.2% to 20.4%. We will continue to investigate the changes between survey result distributions implement this project.
The national dataset does not have state nor region identifiers because the national samples are not constructed to provide representative data at state or region levels.

In [6]:
file_path = "XXHq_20223.xlsx"

# Read the Excel file
df = pd.read_excel(file_path, engine="openpyxl")
df.drop(columns = ['orig_rec'], inplace = True)
In [7]:
print(df.head())
  site  raceeth q6orig q7orig  record   q1   q2   q3   q4     q5  ...  q102  \
0   XX      NaN    505    180       1  3.0  1.0  1.0  NaN      C  ...   2.0   
1   XX      5.0    N N    233       2  4.0  2.0  1.0  2.0      E  ...   2.0   
2   XX      5.0    506    165       3  5.0  2.0  3.0  2.0      E  ...   1.0   
3   XX      5.0    N N    105       4  6.0  1.0  2.0  2.0      E  ...   2.0   
4   XX      5.0    601    125       5  3.0  2.0  1.0  2.0      E  ...   2.0   

   q103  q104  q105  q106  q107     BMIPCT  weight  stratum    psu  
0   3.0   3.0   2.0   2.0   2.0  97.083855  0.8614      103  16294  
1   2.0   5.0   2.0   1.0   2.0        NaN  0.8920      103  16294  
2   2.0   4.0   2.0   2.0   1.0  92.262516  0.5081      103  16294  
3   4.0   5.0   2.0   1.0   1.0        NaN  1.1759      103  16294  
4   2.0   5.0   2.0   2.0   1.0   7.567053  0.8920      103  16294  

[5 rows x 116 columns]
In [8]:
# Drop columns Q88 to Q107 for not being in standard survey and too much missing values which may not contribute to the project
columns_to_drop = [f"q{num}" for num in range(88, 108)]  # Generates a list of q88 to q107
df = df.drop(columns=columns_to_drop, errors='ignore')  # Ignore errors if column doesn't exist
In [9]:
# Drop columns "q6orig" and "q7orig" for we have the calculated response stored for q6 and q7, so we no longer need the original record.
# Drop not meaningful column "site" as it originally represents the geographic location of the school or district where the survey data is collected, but was masked from the national data sample.
df = df.drop(columns=["q6orig", "q7orig", "site"], errors='ignore')
In [10]:
print(df.head())
   raceeth  record   q1   q2   q3   q4     q5    q6     q7   q8  ...  q82  \
0      NaN       1  3.0  1.0  1.0  NaN      C  1.65  81.65  4.0  ...  2.0   
1      5.0       2  4.0  2.0  1.0  2.0      E   NaN    NaN  5.0  ...  2.0   
2      5.0       3  5.0  2.0  3.0  2.0      E  1.68  74.84  5.0  ...  2.0   
3      5.0       4  6.0  1.0  2.0  2.0      E   NaN    NaN  4.0  ...  2.0   
4      5.0       5  3.0  2.0  1.0  2.0      E  1.85  56.70  5.0  ...  2.0   

   q83  q84  q85  q86  q87     BMIPCT  weight  stratum    psu  
0  1.0  1.0  3.0  1.0  NaN  97.083855  0.8614      103  16294  
1  2.0  3.0  5.0  1.0  4.0        NaN  0.8920      103  16294  
2  1.0  2.0  1.0  1.0  3.0  92.262516  0.5081      103  16294  
3  1.0  3.0  4.0  1.0  3.0        NaN  1.1759      103  16294  
4  1.0  3.0  3.0  1.0  4.0   7.567053  0.8920      103  16294  

[5 rows x 93 columns]
In [11]:
print(df.columns.tolist())
print(df.info())
['raceeth', 'record', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', 'q21', 'q22', 'q23', 'q24', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39', 'q40', 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 'q48', 'q49', 'q50', 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60', 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70', 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80', 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'BMIPCT', 'weight', 'stratum', 'psu']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20103 entries, 0 to 20102
Data columns (total 93 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   raceeth  19733 non-null  float64
 1   record   20103 non-null  int64  
 2   q1       20005 non-null  float64
 3   q2       19945 non-null  float64
 4   q3       19910 non-null  float64
 5   q4       19849 non-null  float64
 6   q5       18714 non-null  object 
 7   q6       17814 non-null  float64
 8   q7       17814 non-null  float64
 9   q8       15071 non-null  float64
 10  q9       19598 non-null  float64
 11  q10      18976 non-null  float64
 12  q11      17865 non-null  float64
 13  q12      19595 non-null  float64
 14  q13      11374 non-null  float64
 15  q14      18982 non-null  float64
 16  q15      17974 non-null  float64
 17  q16      11652 non-null  float64
 18  q17      17460 non-null  float64
 19  q18      14577 non-null  float64
 20  q19      17302 non-null  float64
 21  q20      15752 non-null  float64
 22  q21      15599 non-null  float64
 23  q22      19266 non-null  float64
 24  q23      18286 non-null  float64
 25  q24      19902 non-null  float64
 26  q25      19898 non-null  float64
 27  q26      19863 non-null  float64
 28  q27      19667 non-null  float64
 29  q28      18366 non-null  float64
 30  q29      19336 non-null  float64
 31  q30      14635 non-null  float64
 32  q31      17442 non-null  float64
 33  q32      16820 non-null  float64
 34  q33      19739 non-null  float64
 35  q34      12157 non-null  float64
 36  q35      19804 non-null  float64
 37  q36      19091 non-null  float64
 38  q37      13925 non-null  float64
 39  q38      18923 non-null  float64
 40  q39      18942 non-null  float64
 41  q40      15210 non-null  float64
 42  q41      19248 non-null  float64
 43  q42      19202 non-null  float64
 44  q43      15441 non-null  float64
 45  q44      14414 non-null  float64
 46  q45      14305 non-null  float64
 47  q46      16603 non-null  float64
 48  q47      19239 non-null  float64
 49  q48      19600 non-null  float64
 50  q49      19527 non-null  float64
 51  q50      16847 non-null  float64
 52  q51      14226 non-null  float64
 53  q52      19322 non-null  float64
 54  q53      18981 non-null  float64
 55  q54      16495 non-null  float64
 56  q55      15097 non-null  float64
 57  q56      16121 non-null  float64
 58  q57      18573 non-null  float64
 59  q58      17645 non-null  float64
 60  q59      18339 non-null  float64
 61  q60      16448 non-null  float64
 62  q61      18312 non-null  float64
 63  q62      17944 non-null  float64
 64  q63      15381 non-null  float64
 65  q64      18100 non-null  float64
 66  q65      18301 non-null  float64
 67  q66      11795 non-null  float64
 68  q67      13936 non-null  float64
 69  q68      16608 non-null  float64
 70  q69      19027 non-null  float64
 71  q70      15336 non-null  float64
 72  q71      15245 non-null  float64
 73  q72      15276 non-null  float64
 74  q73      15212 non-null  float64
 75  q74      15183 non-null  float64
 76  q75      14268 non-null  float64
 77  q76      18876 non-null  float64
 78  q77      13876 non-null  float64
 79  q78      12964 non-null  float64
 80  q79      15509 non-null  float64
 81  q80      15203 non-null  float64
 82  q81      15519 non-null  float64
 83  q82      12778 non-null  float64
 84  q83      17924 non-null  float64
 85  q84      15705 non-null  float64
 86  q85      17441 non-null  float64
 87  q86      14541 non-null  float64
 88  q87      16535 non-null  float64
 89  BMIPCT   17814 non-null  float64
 90  weight   20103 non-null  float64
 91  stratum  20103 non-null  int64  
 92  psu      20103 non-null  int64  
dtypes: float64(89), int64(3), object(1)
memory usage: 14.3+ MB
None
In [12]:
print(df.describe())
            raceeth        record            q1            q2            q3  \
count  19733.000000  20103.000000  20005.000000  19945.000000  19910.000000   
mean       5.010845  10140.386161      4.892677      1.504437      2.361477   
std        1.832258   5863.836462      1.218455      0.499993      1.101724   
min        1.000000      1.000000      1.000000      1.000000      1.000000   
25%        5.000000   5062.500000      4.000000      1.000000      1.000000   
50%        5.000000  10129.000000      5.000000      2.000000      2.000000   
75%        6.000000  15203.500000      6.000000      2.000000      3.000000   
max        8.000000  20386.000000      7.000000      2.000000      5.000000   

                 q4            q6            q7            q8            q9  \
count  19849.000000  17814.000000  17814.000000  15071.000000  19598.000000   
mean       1.798630      1.696357     68.680140      4.310796      1.324625   
std        0.401034      0.102717     18.466386      0.975215      0.899069   
min        1.000000      1.320000     31.300000      1.000000      1.000000   
25%        2.000000      1.630000     55.790000      4.000000      1.000000   
50%        2.000000      1.700000     64.410000      5.000000      1.000000   
75%        2.000000      1.780000     77.110000      5.000000      1.000000   
max        2.000000      2.030000    180.990000      5.000000      5.000000   

       ...           q82          q83           q84           q85  \
count  ...  12778.000000  17924.00000  15705.000000  17441.000000   
mean   ...      2.005869      1.57755      2.825215      3.472679   
std    ...      0.346559      1.18036      1.217580      1.376641   
min    ...      1.000000      1.00000      1.000000      1.000000   
25%    ...      2.000000      1.00000      2.000000      3.000000   
50%    ...      2.000000      1.00000      3.000000      4.000000   
75%    ...      2.000000      2.00000      4.000000      4.000000   
max    ...      3.000000      5.00000      5.000000      7.000000   

                q86           q87        BMIPCT        weight       stratum  \
count  14541.000000  16535.000000  1.781400e+04  20103.000000  20103.000000   
mean       1.146001      2.206229  6.337997e+01      1.000000    228.370243   
std        0.803579      1.491575  2.931229e+01      1.270645     73.287806   
min        1.000000      1.000000  1.597498e-08      0.002300    101.000000   
25%        1.000000      1.000000  4.149272e+01      0.153200    201.000000   
50%        1.000000      2.000000  6.887086e+01      0.414300    212.000000   
75%        1.000000      3.000000  9.017639e+01      1.359400    300.000000   
max        7.000000      7.000000  9.996278e+01     22.543700    300.000000   

                psu  
count   20103.00000  
mean   414571.68925  
std    235438.54837  
min     16294.00000  
25%    214944.00000  
50%    401089.00000  
75%    573255.00000  
max    788430.00000  

[8 rows x 92 columns]
In [13]:
# missing values
missing_counts_df = df.isnull().sum()
missing_percent_df = df.isnull().mean()
missing_df = pd.DataFrame({'missing_counts': missing_counts_df,
                              'missing_percent': missing_percent_df}).sort_values(
    by='missing_percent', ascending=False)
missing_df[missing_df['missing_percent'] > 0.3]
Out[13]:
missing_counts missing_percent
q13 8729 0.434214
q16 8451 0.420385
q66 8308 0.413272
q34 7946 0.395264
q82 7325 0.364373
q78 7139 0.355121
q77 6227 0.309755
q37 6178 0.307317
q67 6167 0.306770

Questions with the highest missing percentages (>30%) are:
Q13 - During the past 12 months, on how many days did you carry a gun?
Q16 - During the past 12 months, how many times were you in a physical fight?
Q66 - How do you describe your weight?
Q34 - During the past 30 days, on the days you smoked, how many cigarettes did you smoke per day?
Q82 - During the past 12 months, have you been tested for a sexuality transmitted disease (STD) other than HIV, such as chlamydia or gonorrhea?
Q78 - During the past 12 months, on how many sports teams did you play?
Q77 - In an average week when you are in school on how many days do you go to physical education (PE) classes?
Q37 - During the past 30 days, how did you usually get your electronic vapor?
Q67 - Which of the following are you trying to do about your weight?

q4 1 q35 1 q31 1 q26 1 q24 1 q25 1 q19 1 q18 1 q56 1 dtype: int64

In [14]:
# unique value counts
cols_to_drop = df.nunique().sort_values()[df.nunique().sort_values() == 1]
cols_to_drop
Out[14]:
Series([], dtype: int64)
In [15]:
# Define response mappings
response_mappings = {
    "q1": {1: 12, 2: 13, 3: 14, 4: 15, 5: 16, 6: 17, 7: 18},
    "q2": {1: 0, 2: 1},
    "q3": {1: 9, 2: 10, 3: 11, 4: 12, 5: 13}, ## E.Ungraded or other grade is represented by 13
    "q4": {1: 1, 2: 0},
    "q8": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, ## seat belt use, starts from 0=never, to 4=always
    "q9": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
    "q10": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2}, # binned, 0=did not drive, 1=not drunk driving, 2=drunk drove at least once
    "q11": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3}, # binned, 0=did not drive, 1=not text driving, 2=texted 1-20 days, 3=over 20 days
    "q12": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
    "q13": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
    "q14": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
    "q15": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6.5, 6: 8.5, 7: 10.5, 8: 12},
    "q16": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6.5, 6: 8.5, 7: 10.5, 8: 12},
    "q17": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6.5, 6: 8.5, 7: 10.5, 8: 12},
    "q18": {1: 1, 2: 0},
    "q19": {1: 1, 2: 0},
    "q20": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
    "q21": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2}, # binned, 0=did not date, 1=0 time, 2=at least once
    "q22": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2}, # binned, 0=did not date, 1=0 time, 2=at least once
    "q23": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
    "q24": {1: 1, 2: 0},
    "q25": {1: 1, 2: 0},
    "q26": {1: 1, 2: 0},
    "q27": {1: 1, 2: 0},
    "q28": {1: 1, 2: 0},
    "q29": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
    "q30": {1: 2, 2: 1, 3: 0}, # ever injured in suicide, 2=did not attempt, 1=yes, 0=no
    "q31": {1: 1, 2: 0},  # Ever smoked a cigarette (Yes=1, No=0)
    "q32": {1: 0, 2: 8, 3: 9.5, 4: 11.5, 5: 13.5, 6: 15.5, 7: 17},  # 0=never smoked, Age of first cigarette use (average age for range)
    "q33": {1: 0, 2: 1.5, 3: 3.5, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30},  # Days smoked cigarettes in the past 30 days
    "q34": {1: 0, 2: 0.5, 3: 1, 4: 3.5, 5: 8, 6: 15.5, 7: 20}, 
    "q35": {1: 1, 2: 0},  # Ever used electronic vapor products (Yes=1, No=0)
    "q36": {1: 0, 2: 1.5, 3: 3.5, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30},  # Days used vapor products in the past 30 days
    "q37": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7},  # Categorical, Source of vapor products (categorical mapping)
    "q38": {1: 0, 2: 1.5, 3: 3.5, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30},  # Days used smokeless tobacco in past 30 days
    "q39": {1: 0, 2: 1.5, 3: 4, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30},  # Days smoked cigars in past 30 days
    "q40": {1: 0, 2: 1, 3: 2},  # consider as binned/categorized, Attempted to quit tobacco (Yes=1, No=2, Did not use=0)
    "q41": {1: 0, 2: 8, 3: 9.5, 4: 11.5, 5: 13.5, 6: 15.5, 7: 17},  # 0=never had a drink, Age of first alcohol use
    "q42": {1: 0, 2: 1.5, 3: 4, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30},  # Days drank alcohol in past 30 days
    "q43": {1: 0, 2: 1, 3: 2, 4: 4, 5: 7.5, 6: 14.5, 7: 20},  # Days engaged in binge drinking
    "q44": {1: 0, 2: 1.5, 3: 3, 4: 4, 5: 5, 6: 6.5, 7: 8.5, 8: 10},  # Largest number of drinks consumed
    "q45": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7},  # Categorical, Source of alcohol
    "q46": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 69.5, 7: 100},  # times used marijuana
    "q47": {1: 0, 2: 8, 3: 9.5, 4: 11.5, 5: 13.5, 6: 15.5, 7: 17},  # 0=never used, Age of first marijuana use
    "q48": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # times used marijuana in past 30 days
    "q49": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # Ever used prescription pain medicine without prescription
    "q50": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # Ever used cocaine
    "q51": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # Ever used inhalants
    "q52": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # Ever used heroin
    "q53": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # Ever used methamphetamines
    "q54": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40},  # Ever used ecstasy
    "q55": {1: 0, 2: 1, 3: 2},  # Ever injected any illegal drug
    "q56": {1: 1, 2: 0},  # Ever had sexual intercourse
    "q57": {1: 0, 2: 11, 3: 12, 4: 13, 5: 14, 6: 15, 7: 16, 8: 17},  # 0=never had sex, Age at first sex
    "q58": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6},  # Number of lifetime sexual partners
    "q59": {1: 0, 2: 0.5, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6},  # Number of sexual partners in past 3 months
    "q60": {1: 2, 2: 1, 3: 0},  # Categorical. Drank alcohol/used drugs before last sex (Yes=1, No=0, never had sex=2)
    "q61": {1: 2, 2: 1, 3: 0},  # Categorical. Used a condom last time had sex (Yes=1, No=0), never had sex = 2
    "q62": {1: 7, 2: 0, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6},  # Categorical. Birth control method used, never had sex = 7
    "q63": {1: 3, 2: 0, 3: 1, 4: 2},  # Categorical. Sex of sexual contacts (never had sex=3, female=0, male=1, both=2)
    "q64": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: pd.NA},  # Categorical. Sexual identity, pd.NA for don't know what the question is asking (equals to missing value)
    "q65": {1: 0, 2: 1, 3: 2, 4: pd.NA},  # Categorical. Transgender identitym pd.NA for response 4 not knowing what the question is asking
    "q66": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},  # Categorical.  Perception of weight (0 = very underweight, 4 = very overweight)
    "q67": {1: 1, 2: 3, 3: 2, 4: 0},  # Categorical. Weight control behavior (1 = trying to lose, 2= stay the same, 3 = trying to gain, 0 = nothing)
    "q68": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},  # Times drank fruit juice in past 7 days
    "q69": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},  # Times ate fruit in past 7 days
    "q70": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},   # Times ate green salad in past 7 days
    "q71": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},  # Times ate potatoes in past 7 days
    "q72": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},  # Times ate carrots in past 7 days
    "q73": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},  # Times ate other vegetables in past 7 days
    "q74": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28},  # Times drank soda in past 7 days
    "q75": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7},  # Days ate breakfast in past 7 days
    "q76": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7},  # Days physically active for 60 minutes
    "q77": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5},  # Days attended PE class in an average week
    "q78": {1: 0, 2: 1, 3: 2, 4: 3},  # Number of sports teams played on in past 12 months
    "q79": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},  # Times had a concussion from sports in past 12 months
    "q80": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7},  # Categocial. Frequency level of social media use
    "q81": {1: 1, 2: 0, 3: 2},  # Categorical. Ever tested for HIV (Yes=1, No=0, Not sure=2)
    "q82": {1: 1, 2: 0, 3: 2},  # Categorical. Tested for STD in past 12 months (Yes=1, No=0, Not sure=2)
    "q83": {1: 0, 2: 1, 3: 2, 4: 3, 5: pd.NA},  # Last dental visit (0=past year, 3=never, pd.NA for not sure)
    "q84": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},  # Frequency level of poor mental health in past 30 days
    "q85": {1: 4, 2: 5, 3: 6, 4: 7, 5: 8, 6: 9, 7: 10},  # Average sleep hours on school night
    "q86": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6},  # Categorical. Usual place of sleep
    "q87": {1: 4, 2: 3, 3: 2, 4: 1, 5: 0, 6: 5, 7: pd.NA}  # Grades in school (4=A, 3=B, ..., 0=F, 5=none of the grades, pd.NA for notsure)
}


# Apply transformations to Q1 to Q87
for col in df.columns:
    if col.startswith("q") and col[1:].isdigit():  # Ensure it's a question column
        q_number = int(col[1:])
        if q_number <= 87 and col in response_mappings:  # Only process Q1 to Q87
            df[col] = df[col].map(response_mappings[col])

# Save the transformed dataset
transformed_file_path = "Transformed_XXHq_20223.xlsx"
df.to_excel(transformed_file_path, index=False)

print(f"Transformed dataset saved as: {transformed_file_path}")
Transformed dataset saved as: Transformed_XXHq_20223.xlsx
In [16]:
# unique value counts
cols_to_drop = df.nunique().sort_values()[df.nunique().sort_values() == 1]
cols_to_drop
Out[16]:
Series([], dtype: int64)

Consider the following questions as categorical, as either their first response option is not related to the rest or does not form a meaningful level.
"q10", "q11", "q21", "q22", "q30", "q32", "q37", "q40", "q41", "q45", "q57", "q59", "q60", "q61", "q62", "q63", "q64", "q65", "q66", "q67", "q80", "q81", "q82", "q86"
Consider using the QN dataset for a probit model. Remember to check for normalization assumption.

In [17]:
df = pd.read_excel(transformed_file_path, engine="openpyxl")
In [18]:
# Cramer's V statistic for catagorical association.
excluded_columns = ["q5", "BMIPCT", "q6", "q7", "weight", "stratum", "psu", "record"]
categorical_df = df.drop(columns=excluded_columns, errors='ignore')

# Compute Cramér's V
def cramers_v(x, y):
    """Calculate Cramér's V statistic for categorical-categorical association."""
    confusion_matrix = pd.crosstab(x, y)
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1))) if n > 0 else np.nan

# For all categorical variables
def cramers_v_matrix(dataframe):
    """Computes Cramér's V for all pairs of categorical variables in a DataFrame."""
    cols = dataframe.columns
    n = len(cols)
    cramers_v_df = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)

    for i in range(n):
        for j in range(i, n):
            if i == j:
                cramers_v_df.iloc[i, j] = 1.0  # Perfect correlation with itself
            else:
                try:
                    v = cramers_v(dataframe[cols[i]], dataframe[cols[j]])
                    cramers_v_df.iloc[i, j] = v
                    cramers_v_df.iloc[j, i] = v  # Symmetric matrix
                except Exception:
                    cramers_v_df.iloc[i, j] = np.nan
                    cramers_v_df.iloc[j, i] = np.nan

    return cramers_v_df

# Compute Cramér's V matrix
cramers_v_results = cramers_v_matrix(categorical_df)
In [19]:
print(cramers_v_results)
# plot heatmap for the Cramer's V statistic
plt.figure(figsize=(20, 20))
sns.heatmap(cramers_v_results, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Cramer's V statistic for categorical variables")
plt.show()
          raceeth        q1        q2        q3        q4        q8        q9  \
raceeth  1.000000  0.044846  0.028691  0.041114  1.000000  0.072129  0.043661   
q1       0.044846  1.000000  0.047639  0.604506  0.048649  0.091398  0.069488   
q2       0.028691  0.047639  1.000000  0.025431  0.014294  0.057296  0.044664   
q3       0.041114  0.604506  0.025431  1.000000  0.037300  0.080089  0.051610   
q4       1.000000  0.048649  0.014294  0.037300  1.000000  0.040144  0.057473   
...           ...       ...       ...       ...       ...       ...       ...   
q83      0.065789  0.083819  0.034040  0.070505  0.041340  0.075665  0.057041   
q84      0.047925  0.037839  0.320959  0.030393  0.025802  0.054634  0.053271   
q85      0.045667  0.052175  0.059903  0.057220  0.014175  0.096911  0.060306   
q86      0.042926  0.119990  0.031433  0.061834  0.047074  0.113816  0.085788   
q87      0.100342  0.041993  0.139030  0.047755  0.147813  0.111825  0.054568   

              q10       q11       q12  ...       q78       q79       q80  \
raceeth  0.127186  0.103859  0.034408  ...  0.068120  0.041916  0.040620   
q1       0.295016  0.251001  0.064292  ...  0.051153  0.049902  0.043308   
q2       0.065098  0.060452  0.077037  ...  0.074276  0.060687  0.136509   
q3       0.264071  0.233007  0.043071  ...  0.050002  0.043780  0.045682   
q4       0.079977  0.075568  0.020578  ...  0.075611  0.036500  0.044324   
...           ...       ...       ...  ...       ...       ...       ...   
q83      0.082599  0.035261  0.057427  ...  0.058835  0.075878  0.052961   
q84      0.061932  0.036719  0.043507  ...  0.071981  0.040012  0.097822   
q85      0.077413  0.046771  0.046831  ...  0.087207  0.054950  0.061799   
q86      0.156038  0.062620  0.096786  ...  0.031774  0.127250  0.091787   
q87      0.089512  0.048363  0.060165  ...  0.094252  0.060909  0.054917   

              q81       q82       q83       q84       q85       q86       q87  
raceeth  0.059174  0.056257  0.065789  0.047925  0.045667  0.042926  0.100342  
q1       0.102338  0.083515  0.083819  0.037839  0.052175  0.119990  0.041993  
q2       0.030828  0.069393  0.034040  0.320959  0.059903  0.031433  0.139030  
q3       0.081917  0.064238  0.070505  0.030393  0.057220  0.061834  0.047755  
q4       0.035587  0.018089  0.041340  0.025802  0.014175  0.047074  0.147813  
...           ...       ...       ...       ...       ...       ...       ...  
q83      0.022376  0.027702  1.000000  0.069054  0.084704  0.146813  0.093364  
q84      0.042790  0.044632  0.069054  1.000000  0.151414  0.054875  0.076286  
q85      0.067809  0.065841  0.084704  0.151414  1.000000  0.062269  0.104456  
q86      0.093209  0.097662  0.146813  0.054875  0.062269  1.000000  0.062734  
q87      0.048236  0.046393  0.093364  0.076286  0.104456  0.062734  1.000000  

[85 rows x 85 columns]
No description has been provided for this image
In [20]:
# Convert Cramér's V matrix to a sorted list of variable pairs
cramers_v_pairs = (
    cramers_v_results
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Cramers V'})
)

# Remove duplicate pairs (keep only upper triangle of matrix)
cramers_v_pairs = cramers_v_pairs[cramers_v_pairs["Variable 1"] != cramers_v_pairs["Variable 2"]]

# Sort by Cramér's V in descending order
cramers_v_pairs = cramers_v_pairs.sort_values(by="Cramers V", ascending=False)

# Display sorted associations
print(cramers_v_pairs.head(30))
     Variable 1 Variable 2  Cramers V
4813        q59        q56   1.000000
4728        q58        q56   1.000000
4898        q60        q56   1.000000
4983        q61        q56   1.000000
4561        q56        q59   1.000000
4560        q56        q58   1.000000
4563        q56        q61   1.000000
4562        q56        q60   1.000000
2409        q31        q32   1.000000
2493        q32        q31   1.000000
340          q4    raceeth   1.000000
4       raceeth         q4   1.000000
4643        q57        q56   1.000000
4559        q56        q57   1.000000
5068        q62        q56   0.947679
4564        q56        q62   0.947679
4989        q61        q62   0.823033
5073        q62        q61   0.823033
5153        q63        q56   0.756632
4565        q56        q63   0.756632
687         q11        q10   0.735762
603         q10        q11   0.735762
4732        q58        q60   0.731473
4900        q60        q58   0.731473
4901        q60        q59   0.727589
4817        q59        q60   0.727589
2321        q30        q29   0.721190
2237        q29        q30   0.721190
4648        q57        q61   0.717324
4984        q61        q57   0.717324
In [21]:
# Convert Cramer's V matrix to a sorted list of variable pairs
cramers_v_pairs = (
    cramers_v_results
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Cramers V'})
)

cramers_v_pairs["Pair"] = cramers_v_pairs.apply(lambda x: tuple(sorted([x["Variable 1"], x["Variable 2"]])), axis=1)

# Drop duplicate pairs (keep only unique combinations)
cramers_v_pairs = cramers_v_pairs.drop_duplicates(subset=["Pair"]).drop(columns=["Pair"])

# Remove self-correlations
cramers_v_pairs = cramers_v_pairs[cramers_v_pairs["Variable 1"] != cramers_v_pairs["Variable 2"]]

# Sort in descending order of Cramer's V
cramers_v_pairs = cramers_v_pairs.sort_values(by="Cramers V", ascending=False)

# Print the top 30 unique pairs
print(cramers_v_pairs.head(25))
     Variable 1 Variable 2  Cramers V
4       raceeth         q4   1.000000
4560        q56        q58   1.000000
4561        q56        q59   1.000000
4562        q56        q60   1.000000
4563        q56        q61   1.000000
2409        q31        q32   1.000000
4559        q56        q57   1.000000
4564        q56        q62   0.947679
4989        q61        q62   0.823033
4565        q56        q63   0.756632
603         q10        q11   0.735762
4732        q58        q60   0.731473
4817        q59        q60   0.727589
2237        q29        q30   0.721190
4648        q57        q61   0.717324
4647        q57        q60   0.715309
4903        q60        q61   0.714463
4733        q58        q61   0.713312
4818        q59        q61   0.710113
4904        q60        q62   0.677560
2754        q35        q37   0.667169
2753        q35        q36   0.664059
2757        q35        q40   0.658417
2065        q27        q28   0.644388
2763        q35        q46   0.637556

Highly correlated questions as above.
Q61 Q62 0.823 Condom use during last sex vs. Birth control method used last time
Q10 Q11 0.736 Riding with a drinking driver vs. Driving after drinking alcohol
Q58 Q60 0.731 Number of sex partners vs. Alcohol/drug use before last sex
Q59 Q60 0.728 Current sexual activity vs. Alcohol/drug use before last sex
Q29 Q30 0.721 Suicide attempt vs. Suicide attempt requiring medical attention
Q57 Q61 0.717 Age of first sexual intercourse vs. Condom use last time
Q57 Q60 0.715 Age of first sex vs. Alcohol/drug use before last sex
Q60 Q61 0.714 Alcohol/drug use before last sex vs. Condom use last time
Q58 Q61 0.713 Number of sex partners vs. Condom use last time
Q59 Q61 0.710 Current sexual activity vs. Condom use last time
Q60 Q62 0.678 Alcohol/drug use before last sex vs. Birth control method used
Q1 Q3 0.605 Age vs. Grade level
Q2 Q63 0.567 Sex vs. Sex of sexual contacts
Q52 Q55 0.560 Ever used cocaine vs. Ever injected an illegal drug
Q60 Q63 0.540 Alcohol/drug use before last sex vs. Sex of sexual contacts
Q42 Q43 0.539 Current alcohol use vs. Binge drinking
Q36 Q40 0.539 Ever electronic vapor product use vs. Tried quitting all tobacco products
Q33 Q34 0.537 Current cigarette use vs. Heavy cigarette smoking (>10 per day)
Q61 Q63 0.536 Condom use vs. Sex of sexual contacts
Q37 Q40 0.533 Current electronic vapor product use vs. Tried quitting all tobacco products
Q54 Q55 0.516 Ever used ecstasy vs. Ever injected an illegal drug
Q53 Q55 0.514 Ever used methamphetamine vs. Ever injected an illegal drug
Q81 Q82 0.513 Ever tested for HIV vs. Ever tested for STDs
Q20 Q21 0.511 Forced sexual intercourse vs. Dating sexual violence
Q58 Q59 0.492 Number of sex partners vs. Current sexual activity\

Strong association is seen among sexual behaviors and risky practices. Q57 (Age at first sex), Q58 (Number of sex partners), Q59 (Current sexual activity), Q60 (Alcohol/drug use before last sex), Q61 (Condom use), and Q62 (Birth control method used) are highly interrelated. One possible explanation is that individuals with earlier sexual debut (Q57) tend to have more partners (Q58) and are more likely to engage in risky behaviors like substance use before sex (Q60) and not using protection (Q61, Q62). To account for these correlations, we are combining several question reponses to new binary indicators of risky sexual behaviors. This will also help reduce the dimensionality.
q34, q40, q43 and q44 is dropped because of largely missing values and not meaningful implications.

In [22]:
# A few extra feature engineering to account for collinearity.
# Comment out the ones you wish not to create
df["q10_11_unsafe_driving"] = ((df["q10"] > 1) | (df["q11"] > 1)).astype(int)
df["q27_30_suicide_combined"] = ((df["q30"]) != 0 | (df["q29"] != 0) | (df["q28"] != 0) | (df["q27"] != 0)).astype(int)
df["q33_smoking"] = (df["q33"] != 0).astype(int)
df["q48_55_substance_use"] = (df.loc[:, "q48":"q55"] != 0).any(axis=1).astype(int) # q48 to 55 combined as substance_use indicator
df["q57_early_sexual_intercourse"] = (df["q57"] != 0).astype(int) # first sexual intercourse age question transoformed to binary variable maerked as 1 if q57 response is not 0 (never had sex)
df["q60_62_unsafe_sex"] = ((df["q60"] == 0) | (df["q61"] == 0) | (df["q62"].isin([0, 5]))).astype(int) # as long as one of the related question matches, this variable will be 1
df["q68_73_healthy_diet"] = (df.loc[:, "q68":"q73"].ne(0).sum(axis=1) >= 3).astype(int) # healthy_diet = 1 if at least 3 of q68-73 are not 0
df["q81_82_std_checked"] = ((df["q81"] == 1) | (df["q82"] == 1)).astype(int)
df["suicide_attempt"] = (df["q29"] != 0).astype(int)
In [23]:
## storing possible target variable, choose as you wish. Raw questions are not one hot encoded yet.
q27_raw = df["q27"]
q28_raw = df["q28"]
q29_raw = df["q29"]
q30_raw = df["q30"]
suicide_q29_attempt = df["suicide_attempt"]
suicide_combined_q = df["q27_30_suicide_combined"]
In [24]:
df = df.drop(columns=["q10", "q11", "q27", "q28", "q29", "q30", "q33", "q48", "q49", "q50", "q51", "q52", "q53", "q54", 
                      "q55", "q60", "q61", "q62", "q68", "q69", "q70", "q71", "q72", "q73", "q81", "q82", "suicide_combined", "psu", "stratum", "record"], 
                      errors='ignore'
                      )
In [25]:
# renaming for clarity, add more if you would like to.
df = df.rename(columns={'q1': 'q1_age',
                        'q2': 'q2_gender',
                        'q3': 'q3_grade',
                        'q4': 'q4_ethnicity',
                        'q5': 'q5_race',
                        'q6': 'q6_height',
                        'q7': 'q7_weight',
                        'q8': 'q8_seatbelt_riding',
                        'q9': 'q9_ride_w_drunk_drinver', 
                        'q12': 'q12_days_carrying_weapon_atschool',
                        'q13': 'q13_days_carrying_gun',
                        'q14': 'q14_skipped_school_bc_insecure',
                        'q15': 'q15_been_threatened_at_school',
                        'q16': 'q16_12mo_physical_fight',
                        'q17': 'q17_12mo_physicalf_school',
                        'q18': 'q18_seen_vio_neighborhood',
                        'q19': 'q19_forced_sex',
                        'q20': 'q20_forced_sexual',
                        'q21': 'q21_dating_unwilling_sexual',
                        'q22': 'q22_dating_physical_hurting',
                        'q23': 'q23_discriminated',
                        'q24': 'q24_been_bullied',
                        'q25': 'q25_electronically_bullied',
                        'q26': 'q26_depression',
                        'q31': 'q31_ever_smoked',
                        'q32': 'q32_first_smoke_age',
                        'q34': 'q34_smoke_count_daily',
                        'q35': 'q35_ever_vaped',
                        'q36': 'q36_days_vaped_30d',
                        'q37': 'q37_vapor_source',
                        'q38': 'q38_days_chewed_30d',
                        'q39': 'q39_days_smoked_30d',
                        'q40': 'q40_quit_tabacco_12mo',
                        'q41': 'q41_first_drink_age',
                        'q42': 'q42_days_drank_30d',
                        'q43': 'q43_days_binge_drink_30d',
                        'q44': 'q44_max_drinks_30d',
                        'q45': 'q45_alcohol_source',
                        'q46': 'q46_times_marijuana',
                        'q47': 'q47_first_marijuana_age',
                        'q56': 'q56_early_sex', 
                        'q57': 'q57_age_first_sex',
                        'q58': 'q58_sexual_partners_count',
                        'q59': 'q59_sex_count_3mo',
                        'q63': 'q63_sex_contact_gender', 
                        'q64': 'q64_sexual_identity',
                        'q65': 'q65_transgender', 
                        'q66': 'q66_weight_perception', 
                        'q67': 'q67_weight_plans',
                        'q74': 'q74_soda_count',
                        'q75': 'q75_days_breakfast_1w',
                        'q76': 'q76_days_physically_active_1w',
                        'q77': 'q77_days_PE_class_1w',
                        'q78': 'q78_sports_team_count_12mo',
                        'q79': 'q79_concussion_count_12mo',
                        'q80': 'q80_social_media_freq',
                        'q83': 'q83_last_dentist',
                        'q84': 'q84_mental_health_not_good',
                        'q85': 'q85_workdays_average_sleep',
                        'q86': 'q86_unstable_housing',
                        'q87': 'q87_academic_grades'
                        }
                        )   
In [27]:
df = df.drop(columns=['q4_ethnicity', 'q5_race'], 
                      errors='ignore'
                      )
In [28]:
print(df.head(5))
   raceeth  q1_age  q2_gender  q3_grade  q6_height  q7_weight  \
0      NaN    14.0        0.0       9.0       1.65      81.65   
1      5.0    15.0        1.0       9.0        NaN        NaN   
2      5.0    16.0        1.0      11.0       1.68      74.84   
3      5.0    17.0        0.0      10.0        NaN        NaN   
4      5.0    14.0        1.0       9.0       1.85      56.70   

   q8_seatbelt_riding  q9_ride_w_drunk_drinver  \
0                 3.0                      4.5   
1                 4.0                      0.0   
2                 4.0                      2.5   
3                 3.0                      0.0   
4                 4.0                      0.0   

   q12_days_carrying_weapon_atschool  q13_days_carrying_gun  ...  weight  \
0                                0.0                    0.0  ...  0.8614   
1                                0.0                    0.0  ...  0.8920   
2                                0.0                    0.0  ...  0.5081   
3                                0.0                    0.0  ...  1.1759   
4                                0.0                    0.0  ...  0.8920   

   q10_11_unsafe_driving  q27_30_suicide_combined  q33_smoking  \
0                      0                        1            0   
1                      0                        1            0   
2                      1                        1            0   
3                      1                        1            0   
4                      0                        1            0   

   q48_55_substance_use  q57_early_sexual_intercourse  q60_62_unsafe_sex  \
0                     1                             0                  0   
1                     0                             0                  0   
2                     1                             1                  1   
3                     1                             1                  1   
4                     0                             1                  1   

   q68_73_healthy_diet  q81_82_std_checked  suicide_attempt  
0                    0                   0                0  
1                    0                   0                0  
2                    1                   0                0  
3                    0                   0                0  
4                    1                   0                0  

[5 rows x 71 columns]

STOP HERE¶

Cramer's V after rename, feature creation and combine.¶

In [36]:
# Cramer's V statistic for catagorical association.
excluded_columns = ["BMIPCT", "q6_height", "q7_weight", "weight"]
categorical_df = df.drop(columns=excluded_columns, errors='ignore')

# Compute Cramér's V
def cramers_v(x, y):
    """Calculate Cramér's V statistic for categorical-categorical association."""
    confusion_matrix = pd.crosstab(x, y)
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1))) if n > 0 else np.nan

# For all categorical variables
def cramers_v_matrix(dataframe):
    """Computes Cramér's V for all pairs of categorical variables in a DataFrame."""
    cols = dataframe.columns
    n = len(cols)
    cramers_v_df = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)

    for i in range(n):
        for j in range(i, n):
            if i == j:
                cramers_v_df.iloc[i, j] = 1.0  # Perfect correlation with itself
            else:
                try:
                    v = cramers_v(dataframe[cols[i]], dataframe[cols[j]])
                    cramers_v_df.iloc[i, j] = v
                    cramers_v_df.iloc[j, i] = v  # Symmetric matrix
                except Exception:
                    cramers_v_df.iloc[i, j] = np.nan
                    cramers_v_df.iloc[j, i] = np.nan

    return cramers_v_df

# Compute Cramér's V matrix
cramers_v_results = cramers_v_matrix(categorical_df)
In [ ]:
# Convert Cramer's V matrix to a sorted list of variable pairs
cramers_v_pairs = (
    cramers_v_results
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Cramers V'})
)

cramers_v_pairs["Pair"] = cramers_v_pairs.apply(lambda x: tuple(sorted([x["Variable 1"], x["Variable 2"]])), axis=1)

# Drop duplicate pairs (keep only unique combinations)
cramers_v_pairs = cramers_v_pairs.drop_duplicates(subset=["Pair"]).drop(columns=["Pair"])

# Remove self-correlations
cramers_v_pairs = cramers_v_pairs[cramers_v_pairs["Variable 1"] != cramers_v_pairs["Variable 2"]]

# Sort in descending order
cramers_v_pairs = cramers_v_pairs.sort_values(by="Cramers V", ascending=False)

print(cramers_v_pairs.head(25))
                        Variable 1                    Variable 2  Cramers V
2518                 q56_early_sex     q58_sexual_partners_count   1.000000
2519                 q56_early_sex             q59_sex_count_3mo   1.000000
1429               q31_ever_smoked           q32_first_smoke_age   1.000000
2608             q57_age_first_sex  q57_early_sexual_intercourse   1.000000
2517                 q56_early_sex             q57_age_first_sex   1.000000
2675     q58_sexual_partners_count  q57_early_sexual_intercourse   0.996213
1601         q34_smoke_count_daily                   q33_smoking   0.994110
2742             q59_sex_count_3mo  q57_early_sexual_intercourse   0.983372
2541                 q56_early_sex  q57_early_sexual_intercourse   0.979905
2542                 q56_early_sex             q60_62_unsafe_sex   0.944343
2743             q59_sex_count_3mo             q60_62_unsafe_sex   0.916442
2609             q57_age_first_sex             q60_62_unsafe_sex   0.913534
2676     q58_sexual_partners_count             q60_62_unsafe_sex   0.913521
4217  q57_early_sexual_intercourse             q60_62_unsafe_sex   0.786396
2520                 q56_early_sex        q63_sex_contact_gender   0.756632
2809        q63_sex_contact_gender  q57_early_sexual_intercourse   0.756523
2810        q63_sex_contact_gender             q60_62_unsafe_sex   0.670078
1634                q35_ever_vaped              q37_vapor_source   0.667169
1633                q35_ever_vaped            q36_days_vaped_30d   0.664059
1637                q35_ever_vaped         q40_quit_tabacco_12mo   0.658417
1643                q35_ever_vaped           q46_times_marijuana   0.637556
1644                q35_ever_vaped       q47_first_marijuana_age   0.636909
70                          q1_age                      q3_grade   0.604506
175                      q2_gender        q63_sex_contact_gender   0.567337
2109            q42_days_drank_30d      q43_days_binge_drink_30d   0.539460
In [ ]:
# Define categorical variables to analyze (excluding the target variable itself)
columns = [col for col in df.columns if df[col].nunique() <= 15]  # Filter categorical vars

n_cols = 2  
n_rows = -(-len(columns) // n_cols)  
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
axes = axes.flatten() 

for i, col in enumerate(columns):
    df[col].value_counts(normalize=True).plot(kind='bar', colormap='viridis', ax=axes[i])
    
    # Formatting
    axes[i].set_title(f'Distribution of {col}', fontsize=14)
    axes[i].set_xlabel(col, fontsize=12)
    axes[i].set_ylabel('Proportion', fontsize=12)
    axes[i].tick_params(axis='x', rotation=45)  

# Remove empty subplots if there are any
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
group_var = "suicide_attempt"

# Define categorical variables to analyze (excluding the target variable itself)
columns = [col for col in df.columns if col != group_var and df[col].nunique() <= 15]  # Filter categorical vars

n_cols = 2 
n_rows = -(-len(columns) // n_cols) 
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
axes = axes.flatten()  

for i, col in enumerate(columns):
    crosstab = pd.crosstab(df[col], df[group_var], normalize='index') * 100 
    
    # Plot as stacked bar chart
    crosstab.plot(kind='bar', stacked=True, colormap='viridis', ax=axes[i])
    
    # Formatting
    axes[i].set_title(f'{col} vs {group_var}', fontsize=14)
    axes[i].set_xlabel(col, fontsize=12)
    axes[i].set_ylabel('Percentage', fontsize=12)
    axes[i].legend(title=group_var, loc='upper right')
    axes[i].tick_params(axis='x', rotation=45) 

# Remove empty subplots if there are any
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [41]:
# List of numerical columns to plot
numerical_columns = ['BMIPCT', 'q6_height', 'q7_weight']

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 10))  # 1 row, 3 columns

for i, col in enumerate(numerical_columns):
    sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis")

    # Formatting
    axes[i].set_title(f'Distribution of {col} by Suicide Attempts', fontsize=14)
    axes[i].set_xlabel('Suicide Attempts', fontsize=12)
    axes[i].set_ylabel(col, fontsize=12)

plt.tight_layout()
plt.show()
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\3025135869.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis")
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\3025135869.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis")
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\3025135869.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis")
No description has been provided for this image
In [42]:
# List of categorical columns to plot
columns = ['suicide_attempt', 'raceeth'] 

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(5, 10))  
axes = axes.flatten() 

for i, col in enumerate(columns):
    value_counts = df[col].value_counts(normalize=True) * 100  
    sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i], palette="viridis")
    
    # Formatting
    axes[i].set_title(f'Distribution of {col} (%)', fontsize=14)
    axes[i].set_xlabel(col, fontsize=12)
    axes[i].set_ylabel('Percentage', fontsize=12)
    axes[i].tick_params(axis='x', rotation=45) 

plt.tight_layout()
plt.show()
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\770245347.py:9: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i], palette="viridis")
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\770245347.py:9: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i], palette="viridis")
No description has been provided for this image
In [43]:
column = 'raceeth'
# Define the legend mapping
raceeth_legend = {
    1: "Indian/Alaskan Native",
    2: "Asian",
    3: "Black or African American",
    4: "Native Hawaiian/Other PI",
    5: "White",
    6: "Hispanic/Latino",
    7: "Multiple - Hispanic",
    8: "Multiple - Non-Hispanic"
}

# Calculate value counts and normalize to percentages
value_counts = df[column].value_counts(normalize=True) * 100

# Create the bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis")

# Formatting
plt.title(f'Distribution of {column} (%)', fontsize=14)
plt.xlabel("Race/Ethnicity", fontsize=12)
plt.ylabel('Percentage', fontsize=12)
# plt.xticks(ticks=value_counts.index, labels=[raceeth_legend[val] for val in value_counts.index], rotation=45)

# Add legend
legend_labels = [f"{key} = {value}" for key, value in raceeth_legend.items() if key in value_counts.index]
plt.legend(legend_labels, title="Race/Ethnicity", loc='upper right', fontsize=10)

# Show plot
plt.tight_layout()
plt.show()
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\1541197035.py:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis")
No description has been provided for this image
In [44]:
print(df.shape)
(20103, 71)

EDA for PRESENTATION

In [45]:
df['suicide_attempt'].value_counts()
Out[45]:
suicide_attempt
0    17341
1     2762
Name: count, dtype: int64
In [46]:
df['raceeth'].value_counts(normalize=True)*100
Out[46]:
raceeth
5.0    49.156236
7.0    14.118482
8.0     9.192723
3.0     9.076167
1.0     6.760249
6.0     6.121725
2.0     5.042315
4.0     0.532104
Name: proportion, dtype: float64
In [47]:
df['raceeth'].value_counts(normalize=True)*100
Out[47]:
raceeth
5.0    49.156236
7.0    14.118482
8.0     9.192723
3.0     9.076167
1.0     6.760249
6.0     6.121725
2.0     5.042315
4.0     0.532104
Name: proportion, dtype: float64

ONE LAST THING
Depending on how you want to impute the missing data, either dropping the not important columns or impute with decision trees. I will leave that to you.

In [48]:
# unique value counts
cols_to_drop = df.nunique().sort_values()[df.nunique().sort_values() == 1]
cols_to_drop
Out[48]:
Series([], dtype: int64)
In [49]:
df = df.drop(columns=cols_to_drop.index, errors='ignore')
df.shape
Out[49]:
(20103, 71)
In [50]:
# missing values
missing_counts_df = df.isnull().sum()
missing_percent_df = df.isnull().mean()
missing_df = pd.DataFrame({'missing_counts': missing_counts_df,
                              'missing_percent': missing_percent_df}).sort_values(
    by='missing_percent', ascending=False)
missing_df[missing_df['missing_percent'] > 0]
Out[50]:
missing_counts missing_percent
q13_days_carrying_gun 8729 0.434214
q16_12mo_physical_fight 8451 0.420385
q66_weight_perception 8308 0.413272
q34_smoke_count_daily 7946 0.395264
q78_sports_team_count_12mo 7139 0.355121
... ... ...
q25_electronically_bullied 205 0.010197
q24_been_bullied 201 0.009999
q3_grade 193 0.009601
q2_gender 158 0.007860
q1_age 98 0.004875

61 rows × 2 columns

In [51]:
df[missing_df.index[:5]].describe()
Out[51]:
q13_days_carrying_gun q16_12mo_physical_fight q66_weight_perception q34_smoke_count_daily q78_sports_team_count_12mo
count 11374.000000 11652.000000 11795.000000 12157.000000 12964.000000
mean 0.157464 0.575781 2.155320 0.142387 0.922632
std 0.854565 1.745885 0.904338 1.325005 1.065088
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 2.000000 0.000000 0.000000
50% 0.000000 0.000000 2.000000 0.000000 1.000000
75% 0.000000 0.000000 3.000000 0.000000 2.000000
max 6.000000 12.000000 4.000000 20.000000 3.000000

impute missing values using decision tree regressor.

In [52]:
# impute missing values using decision tree regressor
# get the columns with missing values greater than 0
cols = missing_df[missing_df['missing_percent'] > 0].index.tolist()

for col in cols:
    print(f"Imputing missing values for column: {col}")
    df_hidden = df.copy()
    # impute missing values with Mode for columns other than 'col' in test_data 
    for s in df.columns.difference([col]):
        df_hidden.loc[:, s] = df_hidden[s].fillna(df[s].mode()[0])

    # Split the data into train and test sets
    train_data = df_hidden.dropna()
    test_data = df_hidden[df_hidden[col].isnull()]
    
    X_train = train_data.drop(columns=[col])
    y_train = train_data[col]
    X_test = test_data.drop(columns=[col])
    y_test = test_data[col]
    # Train a decision tree regressor
    tree = DecisionTreeRegressor()
    tree.fit(X_train, y_train)
    # Predict the missing values
    y_pred = tree.predict(X_test)
    # Fill in the missing values
    df.loc[df[col].isnull(), col] = y_pred
Imputing missing values for column: q13_days_carrying_gun
Imputing missing values for column: q16_12mo_physical_fight
Imputing missing values for column: q66_weight_perception
Imputing missing values for column: q34_smoke_count_daily
Imputing missing values for column: q78_sports_team_count_12mo
Imputing missing values for column: q77_days_PE_class_1w
Imputing missing values for column: q37_vapor_source
Imputing missing values for column: q67_weight_plans
Imputing missing values for column: q75_days_breakfast_1w
Imputing missing values for column: q45_alcohol_source
Imputing missing values for column: q44_max_drinks_30d
Imputing missing values for column: q86_unstable_housing
Imputing missing values for column: q18_seen_vio_neighborhood
Imputing missing values for column: q8_seatbelt_riding
Imputing missing values for column: q74_soda_count
Imputing missing values for column: q80_social_media_freq
Imputing missing values for column: q40_quit_tabacco_12mo
Imputing missing values for column: q63_sex_contact_gender
Imputing missing values for column: q43_days_binge_drink_30d
Imputing missing values for column: q79_concussion_count_12mo
Imputing missing values for column: q21_dating_unwilling_sexual
Imputing missing values for column: q84_mental_health_not_good
Imputing missing values for column: q20_forced_sexual
Imputing missing values for column: q87_academic_grades
Imputing missing values for column: q56_early_sex
Imputing missing values for column: q83_last_dentist
Imputing missing values for column: q46_times_marijuana
Imputing missing values for column: q32_first_smoke_age
Imputing missing values for column: q19_forced_sex
Imputing missing values for column: q85_workdays_average_sleep
Imputing missing values for column: q31_ever_smoked
Imputing missing values for column: q17_12mo_physicalf_school
Imputing missing values for column: q64_sexual_identity
Imputing missing values for column: q58_sexual_partners_count
Imputing missing values for column: BMIPCT
Imputing missing values for column: q7_weight
Imputing missing values for column: q6_height
Imputing missing values for column: q15_been_threatened_at_school
Imputing missing values for column: q65_transgender
Imputing missing values for column: q23_discriminated
Imputing missing values for column: q59_sex_count_3mo
Imputing missing values for column: q57_age_first_sex
Imputing missing values for column: q76_days_physically_active_1w
Imputing missing values for column: q38_days_chewed_30d
Imputing missing values for column: q39_days_smoked_30d
Imputing missing values for column: q14_skipped_school_bc_insecure
Imputing missing values for column: q36_days_vaped_30d
Imputing missing values for column: q42_days_drank_30d
Imputing missing values for column: q47_first_marijuana_age
Imputing missing values for column: q41_first_drink_age
Imputing missing values for column: q22_dating_physical_hurting
Imputing missing values for column: q12_days_carrying_weapon_atschool
Imputing missing values for column: q9_ride_w_drunk_drinver
Imputing missing values for column: raceeth
Imputing missing values for column: q35_ever_vaped
Imputing missing values for column: q26_depression
Imputing missing values for column: q25_electronically_bullied
Imputing missing values for column: q24_been_bullied
Imputing missing values for column: q3_grade
Imputing missing values for column: q2_gender
Imputing missing values for column: q1_age
In [53]:
df[cols[:5]].describe()
Out[53]:
q13_days_carrying_gun q16_12mo_physical_fight q66_weight_perception q34_smoke_count_daily q78_sports_team_count_12mo
count 20103.000000 20103.000000 20103.000000 20103.000000 20103.000000
mean 0.141397 0.565836 2.238770 0.200020 0.965925
std 0.802500 1.784646 0.877138 1.688298 1.086537
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 2.000000 0.000000 0.000000
50% 0.000000 0.000000 2.000000 0.000000 1.000000
75% 0.000000 0.000000 3.000000 0.000000 2.000000
max 6.000000 12.000000 4.000000 20.000000 3.000000
In [54]:
df.to_csv("df_imputed_XXHq_20223.csv", index=False)
In [55]:
df.columns[:50]
Out[55]:
Index(['raceeth', 'q1_age', 'q2_gender', 'q3_grade', 'q6_height', 'q7_weight',
       'q8_seatbelt_riding', 'q9_ride_w_drunk_drinver',
       'q12_days_carrying_weapon_atschool', 'q13_days_carrying_gun',
       'q14_skipped_school_bc_insecure', 'q15_been_threatened_at_school',
       'q16_12mo_physical_fight', 'q17_12mo_physicalf_school',
       'q18_seen_vio_neighborhood', 'q19_forced_sex', 'q20_forced_sexual',
       'q21_dating_unwilling_sexual', 'q22_dating_physical_hurting',
       'q23_discriminated', 'q24_been_bullied', 'q25_electronically_bullied',
       'q26_depression', 'q31_ever_smoked', 'q32_first_smoke_age',
       'q34_smoke_count_daily', 'q35_ever_vaped', 'q36_days_vaped_30d',
       'q37_vapor_source', 'q38_days_chewed_30d', 'q39_days_smoked_30d',
       'q40_quit_tabacco_12mo', 'q41_first_drink_age', 'q42_days_drank_30d',
       'q43_days_binge_drink_30d', 'q44_max_drinks_30d', 'q45_alcohol_source',
       'q46_times_marijuana', 'q47_first_marijuana_age', 'q56_early_sex',
       'q57_age_first_sex', 'q58_sexual_partners_count', 'q59_sex_count_3mo',
       'q63_sex_contact_gender', 'q64_sexual_identity', 'q65_transgender',
       'q66_weight_perception', 'q67_weight_plans', 'q74_soda_count',
       'q75_days_breakfast_1w'],
      dtype='object')
In [56]:
df.columns[50:100]
Out[56]:
Index(['q76_days_physically_active_1w', 'q77_days_PE_class_1w',
       'q78_sports_team_count_12mo', 'q79_concussion_count_12mo',
       'q80_social_media_freq', 'q83_last_dentist',
       'q84_mental_health_not_good', 'q85_workdays_average_sleep',
       'q86_unstable_housing', 'q87_academic_grades', 'BMIPCT', 'weight',
       'q10_11_unsafe_driving', 'q27_30_suicide_combined', 'q33_smoking',
       'q48_55_substance_use', 'q57_early_sexual_intercourse',
       'q60_62_unsafe_sex', 'q68_73_healthy_diet', 'q81_82_std_checked',
       'suicide_attempt'],
      dtype='object')